Load in packages

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5     ✓ purrr   0.3.4
✓ tibble  3.1.4     ✓ dplyr   1.0.7
✓ tidyr   1.1.3     ✓ stringr 1.4.0
✓ readr   2.0.1     ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(tidyverse)
library(ggplot2)

import data:

find missing data:

No missing data. Check data types of each variable:

str(house)
'data.frame':   21613 obs. of  22 variables:
 $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
 $ date         : chr  "20141013T000000" "20141209T000000" "20150225T000000" "20141209T000000" ...
 $ price        : num  221900 538000 180000 604000 510000 ...
 $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
 $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
 $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
 $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
 $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
 $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
 $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
 $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
 $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
 $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
 $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
 $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
 $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
 $ long         : num  -122 -122 -122 -122 -122 ...
 $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
 $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
 $ num          : int  1 1 1 1 1 1 1 1 1 1 ...

We will definitely need to change the data type for the date column, and potentially look into creating factors for some of the more ordinal variables.

Convert date variable to date type:

Turning view, condition, grade, and waterfront into factors:

Part 2: EDA

library(gridExtra)

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine
names(house)
 [1] "id"            "date"          "price"         "bedrooms"      "bathrooms"     "sqft_living"   "sqft_lot"     
 [8] "floors"        "waterfront"    "view"          "condition"     "grade"         "sqft_above"    "sqft_basement"
[15] "yr_built"      "yr_renovated"  "zipcode"       "lat"           "long"          "sqft_living15" "sqft_lot15"   
[22] "num"          

Prior to dropping Date and Geotags consider using them for plotting, for example transaction counts by dates?

names(house)
 [1] "price"         "bedrooms"      "bathrooms"     "sqft_living"   "sqft_lot"      "floors"        "waterfront"   
 [8] "view"          "condition"     "grade"         "sqft_above"    "sqft_basement" "yr_built"      "yr_renovated" 
[15] "sqft_living15" "sqft_lot15"   
describe(house)
house 

 16  Variables      21613  Observations
-----------------------------------------------------------------------------------------------------------------------------
price 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0     3625        1   540182   329562   210000   245000   321950   450000   645000   887000  1160000 

lowest :   75000   78000   80000   81000   82000, highest: 5350000 5570000 6890000 7060000 7700000
-----------------------------------------------------------------------------------------------------------------------------
bedrooms 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0       13    0.871    3.371    0.946        2        2        3        3        4        4        5 

lowest :  0  1  2  3  4, highest:  8  9 10 11 33
                                                                                        
Value          0     1     2     3     4     5     6     7     8     9    10    11    33
Frequency     13   199  2760  9824  6882  1601   272    38    13     6     3     1     1
Proportion 0.001 0.009 0.128 0.455 0.318 0.074 0.013 0.002 0.001 0.000 0.000 0.000 0.000
-----------------------------------------------------------------------------------------------------------------------------
bathrooms 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0       30    0.974    2.115   0.8444     1.00     1.00     1.75     2.25     2.50     3.00     3.50 

lowest : 0.00 0.50 0.75 1.00 1.25, highest: 6.50 6.75 7.50 7.75 8.00
-----------------------------------------------------------------------------------------------------------------------------
sqft_living 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0     1038        1     2080    978.4      940     1090     1427     1910     2550     3250     3760 

lowest :   290   370   380   384   390, highest:  9640  9890 10040 12050 13540
-----------------------------------------------------------------------------------------------------------------------------
sqft_lot 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0     9782        1    15107    17855     1800     3322     5040     7618    10688    21398    43339 

lowest :     520     572     600     609     635, highest:  982998 1024068 1074218 1164794 1651359
-----------------------------------------------------------------------------------------------------------------------------
floors 
       n  missing distinct     Info     Mean      Gmd 
   21613        0        6    0.823    1.494   0.5563 

lowest : 1.0 1.5 2.0 2.5 3.0, highest: 1.5 2.0 2.5 3.0 3.5
                                              
Value        1.0   1.5   2.0   2.5   3.0   3.5
Frequency  10680  1910  8241   161   613     8
Proportion 0.494 0.088 0.381 0.007 0.028 0.000
-----------------------------------------------------------------------------------------------------------------------------
waterfront 
       n  missing distinct 
   21613        0        2 
                      
Value          0     1
Frequency  21450   163
Proportion 0.992 0.008
-----------------------------------------------------------------------------------------------------------------------------
view 
       n  missing distinct 
   21613        0        5 

lowest : 0 1 2 3 4, highest: 0 1 2 3 4
                                        
Value          0     1     2     3     4
Frequency  19489   332   963   510   319
Proportion 0.902 0.015 0.045 0.024 0.015
-----------------------------------------------------------------------------------------------------------------------------
condition 
       n  missing distinct 
   21613        0        5 

lowest : 1 2 3 4 5, highest: 1 2 3 4 5
                                        
Value          1     2     3     4     5
Frequency     30   172 14031  5679  1701
Proportion 0.001 0.008 0.649 0.263 0.079
-----------------------------------------------------------------------------------------------------------------------------
grade 
       n  missing distinct 
   21613        0       12 

lowest : 1  3  4  5  6 , highest: 9  10 11 12 13
                                                                                  
Value          1     3     4     5     6     7     8     9    10    11    12    13
Frequency      1     3    29   242  2038  8981  6068  2615  1134   399    90    13
Proportion 0.000 0.000 0.001 0.011 0.094 0.416 0.281 0.121 0.052 0.018 0.004 0.001
-----------------------------------------------------------------------------------------------------------------------------
sqft_above 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0      946        1     1788    876.2      850      970     1190     1560     2210     2950     3400 

lowest :  290  370  380  384  390, highest: 7880 8020 8570 8860 9410
-----------------------------------------------------------------------------------------------------------------------------
sqft_basement 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0      306    0.776    291.5    422.2        0        0        0        0      560      970     1190 

lowest :    0   10   20   40   50, highest: 3260 3480 3500 4130 4820
-----------------------------------------------------------------------------------------------------------------------------
yr_built 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0      116        1     1971    33.38     1915     1926     1951     1975     1997     2007     2011 

lowest : 1900 1901 1902 1903 1904, highest: 2011 2012 2013 2014 2015
-----------------------------------------------------------------------------------------------------------------------------
yr_renovated 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0       70    0.122     84.4    161.7        0        0        0        0        0        0        0 

lowest :    0 1934 1940 1944 1945, highest: 2011 2012 2013 2014 2015
                                                                                                                      
Value          0  1935  1940  1945  1950  1955  1960  1965  1970  1975  1980  1985  1990  1995  2000  2005  2010  2015
Frequency  20699     1     2     6     4    13    12    16    27    25    43    88    99    84   112   156    82   144
Proportion 0.958 0.000 0.000 0.000 0.000 0.001 0.001 0.001 0.001 0.001 0.002 0.004 0.005 0.004 0.005 0.007 0.004 0.007

For the frequency table, variable is rounded to the nearest 5
-----------------------------------------------------------------------------------------------------------------------------
sqft_living15 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0      777        1     1987    743.2     1140     1256     1490     1840     2360     2930     3300 

lowest :  399  460  620  670  690, highest: 5600 5610 5790 6110 6210
-----------------------------------------------------------------------------------------------------------------------------
sqft_lot15 
       n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
   21613        0     8689        1    12768    13404     1999     3667     5100     7620    10083    17852    37063 

lowest :    651    659    660    748    750, highest: 434728 438213 560617 858132 871200
-----------------------------------------------------------------------------------------------------------------------------

Summary plots:

##produce the 4 density plots in a 2 by 2 matrix
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Question: how to deal with indicator (ordinal) variables in this case? Map to binary classes:

Map to binary classes and check distributions and interactions

Checking possible interactions after mapping categorical variables to larger classes

##produce the 4 density plots in a 2 by 2 matrix
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Final check: same scatter plots but with log(price) - no visible interaction with log price.

##produce the 4 density plots in a 2 by 2 matrix
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Quantitative predictors:

Checking how many quantitative observations have 0 values

colSums(house[,quant_vars] == 0)
     yr_built  yr_renovated        floors      bedrooms     bathrooms   sqft_living      sqft_lot    sqft_above 
            0         20699             0            13            10             0             0             0 
sqft_basement sqft_living15    sqft_lot15 
        13126             0             0 

Some homes probably have no basement, so zero values of sqft_basement are okay, but every home is expected to have a non-zero number of bedrooms (13 zeros) and bathrooms (10 zeros). Drop these rows:

colSums(house[,quant_vars] == 0)
     yr_built  yr_renovated        floors      bedrooms     bathrooms   sqft_living      sqft_lot    sqft_above 
            0         20683             0             0             0             0             0             0 
sqft_basement sqft_living15    sqft_lot15 
        13110             0             0 

Converting quantitative predictor floors to a factor 1, 2, 3.

grid.arrange(sp_floors, bp_floors, ncol = 2, nrow = 1)
`geom_smooth()` using formula 'y ~ x'

Computing age of the house and removing yr_built and yr_renovated

names(house)
 [1] "price"         "bedrooms"      "bathrooms"     "sqft_living"   "sqft_lot"      "floors"        "waterfront"   
 [8] "view"          "condition"     "grade"         "sqft_above"    "sqft_basement" "sqft_living15" "sqft_lot15"   
[15] "age"          

Final set of quantitative vars:

hist.data.frame(house[,quant_vars])
click left mouse button to proceed

Correlations of quantitative vars:

ggcorrplot(corr, 
           method = "circle", 
           lab = TRUE,
          # type = "lower", 
           outline.color = "white", 
           ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"))
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

TODO: interpreting models https://cran.r-project.org/web/packages/jtools/vignettes/summ.html

summary(fit)

Call:
lm(formula = log(price) ~ ., data = house)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.60681 -0.23786  0.01236  0.23454  1.77476 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    1.199e+01  1.965e-02 610.122  < 2e-16 ***
bedrooms      -4.121e-02  3.216e-03 -12.813  < 2e-16 ***
bathrooms      1.083e-01  5.541e-03  19.537  < 2e-16 ***
sqft_living    2.795e-04  7.204e-06  38.797  < 2e-16 ***
sqft_lot       1.918e-07  8.132e-08   2.359   0.0184 *  
floors2        7.979e-02  7.275e-03  10.967  < 2e-16 ***
floors3        3.601e-01  1.566e-02  22.990  < 2e-16 ***
waterfront.L   2.996e-01  1.981e-02  15.126  < 2e-16 ***
view1          1.477e-01  8.742e-03  16.895  < 2e-16 ***
condition1     3.170e-02  5.499e-03   5.764 8.30e-09 ***
grade1         8.806e-02  5.366e-03  16.411  < 2e-16 ***
sqft_above    -3.194e-05  7.208e-06  -4.431 9.41e-06 ***
sqft_basement         NA         NA      NA       NA    
sqft_living15  1.754e-04  5.527e-06  31.724  < 2e-16 ***
sqft_lot15    -8.582e-07  1.243e-07  -6.905 5.17e-12 ***
age            4.169e-03  1.140e-04  36.583  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3426 on 21582 degrees of freedom
Multiple R-squared:  0.5769,    Adjusted R-squared:  0.5766 
F-statistic:  2102 on 14 and 21582 DF,  p-value: < 2.2e-16

summary(fit_les.sqft_basement)

Call:
lm(formula = log(price) ~ . - sqft_basement, data = house)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.60681 -0.23786  0.01236  0.23454  1.77476 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    1.199e+01  1.965e-02 610.122  < 2e-16 ***
bedrooms      -4.121e-02  3.216e-03 -12.813  < 2e-16 ***
bathrooms      1.083e-01  5.541e-03  19.537  < 2e-16 ***
sqft_living    2.795e-04  7.204e-06  38.797  < 2e-16 ***
sqft_lot       1.918e-07  8.132e-08   2.359   0.0184 *  
floors2        7.979e-02  7.275e-03  10.967  < 2e-16 ***
floors3        3.601e-01  1.566e-02  22.990  < 2e-16 ***
waterfront.L   2.996e-01  1.981e-02  15.126  < 2e-16 ***
view1          1.477e-01  8.742e-03  16.895  < 2e-16 ***
condition1     3.170e-02  5.499e-03   5.764 8.30e-09 ***
grade1         8.806e-02  5.366e-03  16.411  < 2e-16 ***
sqft_above    -3.194e-05  7.208e-06  -4.431 9.41e-06 ***
sqft_living15  1.754e-04  5.527e-06  31.724  < 2e-16 ***
sqft_lot15    -8.582e-07  1.243e-07  -6.905 5.17e-12 ***
age            4.169e-03  1.140e-04  36.583  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3426 on 21582 degrees of freedom
Multiple R-squared:  0.5769,    Adjusted R-squared:  0.5766 
F-statistic:  2102 on 14 and 21582 DF,  p-value: < 2.2e-16

Questions: floors 1, 2, 3 as factors?

N/A in sqft_basement

Will removing outliers help with Residuals/Fitted values

str(house)
'data.frame':   21597 obs. of  15 variables:
 $ price        : num  221900 538000 180000 604000 510000 ...
 $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
 $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
 $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
 $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
 $ floors       : Factor w/ 3 levels "1","2","3": 1 2 1 1 1 1 2 1 1 2 ...
 $ waterfront   : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
 $ view         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ condition    : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
 $ grade        : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 1 1 ...
 $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
 $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
 $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
 $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
 $ age          : num  66 30 88 56 34 20 26 58 61 18 ...
---
title: 'STAT 6021: Project 2'
author: "Connie Cui"
date: "11/26/2021"
output:
  html_document:
    df_print: paged
  html_notebook: default
---


Load in packages
```{r}
library(tidyverse)
library(ggplot2)
```
import data:
```{r}
# Read the raw housing data from the working directory and preview the first rows
house <- read.csv(file = "house_data.csv")
head(house, n = 6)
```
find missing data:
```{r}
# Show every row that contains at least one missing value
subset(house, !complete.cases(house))
```
No missing data.
Check data types of each variable:
```{r}
# Compact overview of each column's type and first values
str(object = house)
```
We will definitely need to change the data type for the date column, and potentially look into creating factors for some of the more ordinal variables.
```{r}
# Drop the last 7 characters of each date string (the "T000000" time suffix),
# leaving only the leading yyyymmdd part
house[["date"]] <- substr(
  house[["date"]],
  start = 1,
  stop = nchar(house[["date"]]) - 7
)
head(house, n = 6)
```
Convert date variable to date type:
```{r}
# Parse the yyyymmdd strings into Date objects
house[["date"]] <- as.Date(house[["date"]], format = "%Y%m%d")
head(house, n = 6)
```
Turning view, condition, grade, and waterfront into factors:
```{r}
# view, condition and grade are rating scales, so they are stored as ordered
# factors with their full set of possible levels.
house$view <- factor(house$view, levels = 0:4, ordered = TRUE)
house$condition <- factor(house$condition, levels = 1:5, ordered = TRUE)
house$grade <- factor(house$grade, levels = 1:15, ordered = TRUE)
# waterfront is a plain 0/1 indicator. It is kept as an *unordered* factor:
# an ordered factor would be given a polynomial contrast by lm(), so the model
# would report a rescaled `waterfront.L` term instead of the direct
# waterfront-vs-not effect.
house$waterfront <- factor(house$waterfront, levels = c(0, 1))
```


## Part 2: EDA

```{r}
#install.packages("ggcorrplot")
#install.packages("miscset")
#library(miscset)
#library(Hmisc)
library(tidyverse)
library(dplyr)
library(faraway)
library(gridExtra)
```

```{r}
# List the available columns before dropping identifiers and geotags
colnames(house)
```

#### Prior to dropping Date and Geotags consider using them for plotting, for example transaction counts by dates?


```{r}
# Remove identifier, sale-date and location columns
house <- dplyr::select(house, -c(id, num, date, zipcode, lat, long))
colnames(house)
```

```{r}
#describe(house)
```


Summary plots:
```{r}
# Scatter plots of price against sqft_living, coloured by each categorical
# predictor, with one least-squares line per level (no confidence band).
# The x/y mapping and the layers are identical in all four panels, so they are
# built once; each panel only adds its own colour mapping and title.
scatter_base <- ggplot(house, aes(x = sqft_living, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

sp1 <- scatter_base +
  aes(colour = waterfront) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with waterfront indicator")

sp2 <- scatter_base +
  aes(colour = view) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with view indicator")

sp3 <- scatter_base +
  aes(colour = condition) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with condition indicator")

sp4 <- scatter_base +
  aes(colour = grade) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with grade indicator")

# Arrange the four scatter plots in a 2 by 2 grid
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
```


#### Question: how to deal with indicator (ordinal) variables in this case? Map to binary classes:

```{r}
# Names of the categorical predictors
cat_vars <- c("waterfront", "view", "condition", "grade")
```

```{r}
# Horizontal bar charts of the level counts for each categorical predictor.
# ggplotGrid() comes from miscset, whose library() call is commented out in the
# setup chunk, so the grid is laid out with gridExtra::grid.arrange() instead.
# The column is looked up with the .data pronoun rather than the deprecated
# aes_string(); labs() keeps the column name as the axis label.
bar_plots <- lapply(
  c("view", "waterfront", "condition", "grade"),
  function(col) {
    ggplot(house, aes(x = .data[[col]])) +
      geom_bar() +
      coord_flip() +
      labs(x = col)
  }
)
grid.arrange(grobs = bar_plots, ncol = 2)
```

```{r}
# Box plots of price for every level of each categorical predictor.
# The y mapping and the boxplot layer are shared; each panel adds its own
# x mapping and title.
box_base <- ggplot(house, aes(y = price)) +
  geom_boxplot()

bp1 <- box_base +
  aes(x = waterfront) +
  labs(x = "waterfront", y = "price", title = "Price by waterfront")

bp2 <- box_base +
  aes(x = view) +
  labs(x = "view", y = "price", title = "Price by view")

bp3 <- box_base +
  aes(x = condition) +
  labs(x = "condition", y = "price", title = "Price by condition")

bp4 <- box_base +
  aes(x = grade) +
  labs(x = "grade", y = "price", title = "Price by grade")

# Arrange the four box plots in a 2 by 2 grid
grid.arrange(bp1, bp2, bp3, bp4, ncol = 2, nrow = 2)
```

#### Map to binary classes and check distributions and interactions

```{r}
# Collapse each rating scale into a two-level (0/1) factor.

# view: 0 = no view (original level 0), 1 = any other view level
house$view <- factor(ifelse(house$view != 0, 1, 0))

# condition: 0 = condition 3 or lower, 1 = condition 4 or 5
house$condition <- factor(
  ifelse(as.integer(as.character(house$condition)) <= 3, 0, 1)
)

# grade: 0 = grade 7 or lower, 1 = grade 8 or higher.
# The previous level-by-level test listed 1, 2, 3, 4, 5 and 7 but skipped 6,
# which put grade 6 in the "high" class while grade 7 was "low". Comparing the
# numeric grade against a single cut-off keeps the classes monotone.
house$grade <- factor(
  ifelse(as.integer(as.character(house$grade)) <= 7, 0, 1)
)
```


```{r}
# Horizontal bar charts of the level counts after collapsing to binary classes.
# ggplotGrid() comes from miscset, whose library() call is commented out in the
# setup chunk, so the grid is laid out with gridExtra::grid.arrange() instead.
# The column is looked up with the .data pronoun rather than the deprecated
# aes_string(); labs() keeps the column name as the axis label.
bar_plots <- lapply(
  c("view", "waterfront", "condition", "grade"),
  function(col) {
    ggplot(house, aes(x = .data[[col]])) +
      geom_bar() +
      coord_flip() +
      labs(x = col)
  }
)
grid.arrange(grobs = bar_plots, ncol = 2)
```

#### Checking possible interactions after mapping categorical variables to larger classes


```{r}
# Same scatter plots as before, now coloured by the collapsed categorical
# predictors: price against sqft_living with one least-squares line per level.
# Differing slopes between levels would suggest an interaction with sqft_living.
scatter_base <- ggplot(house, aes(x = sqft_living, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

sp1 <- scatter_base +
  aes(colour = waterfront) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with waterfront indicator")

sp2 <- scatter_base +
  aes(colour = view) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with view indicator")

sp3 <- scatter_base +
  aes(colour = condition) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with condition indicator")

sp4 <- scatter_base +
  aes(colour = grade) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with grade indicator")

# Arrange the four scatter plots in a 2 by 2 grid
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)

```

#### Final check: same scatter plots but with log(price) - no visible interaction with log price.

```{r}
# Scatter plots of log(price) against sqft_living, coloured by each categorical
# predictor, with one least-squares line per level (no confidence band).
# The y axis shows log(price), so the axis label and titles say so explicitly.
log_scatter_base <- ggplot(house, aes(x = sqft_living, y = log(price))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

sp1 <- log_scatter_base +
  aes(colour = waterfront) +
  labs(x = "sqft_living",
       y = "log(price)",
       title = "Scatter plot of log(price) against sqft_living with waterfront indicator")

sp2 <- log_scatter_base +
  aes(colour = view) +
  labs(x = "sqft_living",
       y = "log(price)",
       title = "Scatter plot of log(price) against sqft_living with view indicator")

sp3 <- log_scatter_base +
  aes(colour = condition) +
  labs(x = "sqft_living",
       y = "log(price)",
       title = "Scatter plot of log(price) against sqft_living with condition indicator")

sp4 <- log_scatter_base +
  aes(colour = grade) +
  labs(x = "sqft_living",
       y = "log(price)",
       title = "Scatter plot of log(price) against sqft_living with grade indicator")

# Arrange the four scatter plots in a 2 by 2 grid
grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)

```

#### Quantitative predictors:

```{r}

# Columns treated as quantitative predictors at this stage
quant_vars <- c(
  "yr_built", "yr_renovated",
  "floors", "bedrooms", "bathrooms",
  "sqft_living", "sqft_lot", "sqft_above", "sqft_basement",
  "sqft_living15", "sqft_lot15"
)

# Hmisc provides hist.data.frame(), which draws one histogram per column
library(Hmisc)
hist.data.frame(house[quant_vars])

```

#### Checking how many quantitative observations have 0 values

```{r}
# Number of exact-zero entries in each quantitative column
colSums(house[quant_vars] == 0)
```


#### Some homes probably have no basement, so zero values of sqft_basement are okay, but every home is expected to have a non-zero number of bedrooms (13 zeros) and bathrooms (10 zeros). Drop these rows:


```{r}
# Keep only homes with a non-zero number of bathrooms and bedrooms,
# then recount the zeros per quantitative column
house <- house %>%
  filter(bathrooms != 0, bedrooms != 0)
colSums(house[quant_vars] == 0)
```


#### Converting quantitative predictor floors to a factor 1, 2, 3.


```{r}
# Bin floors into three classes: below 2 -> 1, from 2 up to (not including) 3 -> 2,
# 3 and above -> 3. findInterval() counts how many of the cut points (2, 3) are
# at or below each value, giving 0/1/2, hence the + 1.
house$floors <- factor(findInterval(house$floors, c(2, 3)) + 1)

# Price against sqft_living with one least-squares line per floors class
sp_floors <- ggplot(house, aes(x = sqft_living, y = price)) +
  aes(colour = floors) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "sqft_living",
       y = "price",
       title = "Scatter plot of price against sqft_living with floors indicator")

# Distribution of price within each floors class
bp_floors <- ggplot(house, aes(y = price)) +
  aes(x = floors) +
  geom_boxplot() +
  labs(x = "floors", y = "price", title = "Price by number of floors")

# Show the scatter plot and the box plot side by side
grid.arrange(sp_floors, bp_floors, ncol = 2, nrow = 1)
```


#### Computing age of the house and removing yr_built and yr_renovated

```{r}
# Age in years relative to 2021, measured from whichever is later: the build
# year or the renovation year. yr_renovated is 0 for most rows (presumably
# meaning "never renovated" - confirm against the data dictionary), in which
# case the build year is the later one and is used.
house$age <- 2021 - pmax(house$yr_built, house$yr_renovated)
head(house, n = 6)
```

```{r}
# The year columns are now summarised by `age`, so remove them
house <- dplyr::select(house, -c(yr_renovated, yr_built))
colnames(house)
```


#### Final set of quantitative vars:

```{r}
# Final list of quantitative predictors: `age` replaces the two year columns,
# and `floors` is no longer included
quant_vars <- c(
  "age", "bedrooms", "bathrooms",
  "sqft_living", "sqft_lot", "sqft_above", "sqft_basement",
  "sqft_living15", "sqft_lot15"
)

# One histogram per quantitative predictor
hist.data.frame(house[quant_vars])

```
               
#### Correlations of quantitative vars:

```{r}
library(ggcorrplot)

# Pairwise correlations between price and the quantitative predictors,
# rounded to one decimal place for display
corr <- round(cor(house[c("price", quant_vars)]), digits = 1)

# Full correlation matrix drawn as circles with the coefficients printed on top
ggcorrplot(
  corr,
  method = "circle",
  lab = TRUE,
  # type = "lower",
  outline.color = "white",
  ggtheme = ggplot2::theme_gray,
  colors = c("#6D9EC1", "white", "#E46726")
)
```





## TODO: interpreting models https://cran.r-project.org/web/packages/jtools/vignettes/summ.html

```{r}
# Linear model of log(price) on every other column in `house`
fit <- lm(formula = log(price) ~ ., data = house)
summary(object = fit)
```

```{r}
# Standard lm diagnostic plots for the full model
plot(x = fit)
```


```{r}
# Same model with sqft_basement excluded from the predictors
fit_les.sqft_basement <- lm(
  formula = log(price) ~ . - sqft_basement,
  data = house
)
summary(object = fit_les.sqft_basement)
```

```{r}
# Standard lm diagnostic plots for the model without sqft_basement
plot(x = fit_les.sqft_basement)
```

# Questions: floors 1, 2, 3 as factors?
# N/A in sqft_basement
# Will removing outliers help with Residuals/Fitted values

```{r}
# Column types of the final modelling data frame
str(object = house)
```








